#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

#install.packages("factoextra")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)

DF1 - EMPLOYMENT

#Abhay
#Employment - K202301 for 2021

# Get ACS data
df1 <- get_acs(geography = "state", 
               table = "K202301", 
               year = 2021, 
               survey = "acs1-year", 
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202301 and caching the dataset for faster future access.
variables_df1 <- data.frame(
  variable = c("K202301_001", "K202301_002", "K202301_003", "K202301_004", "K202301_005", "K202301_006", "K202301_007"),
  label = c("Total", "In labor force:", "Civilian labor force:", "Employed", 
            "Unemployed", "In Armed Forces", 
            "Not in labor force")
)



# Joining the data with the variable labels to get the descriptive names
df1_labeled <- df1 %>%
  left_join(variables_df1, by = "variable")

# Reshapeing the data so that state names are rows and variable labels are columns
df1_wide <- df1_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# Calculate employment rates
df1_wide <- df1_wide %>%
  mutate(
    EmploymentRate = `Employed` / `Total` * 100,
    UnemploymentRate = `Unemployed` / `Total` * 100,
    NotInLaborForceRate = `Not in labor force` / `Total` * 100
  )

# Plotting employment rates
library(ggplot2)

ggplot(df1_wide, aes(x = reorder(NAME, -EmploymentRate), y = EmploymentRate)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(x = "State", y = "Employment Rate (%)", title = "Employment Rates by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Plotting labor force participation
ggplot(df1_wide, aes(x = NAME)) +
  geom_bar(aes(y = `Employed`, fill = "Employed"), stat = "identity") +
  geom_bar(aes(y = `Unemployed`, fill = "Unemployed"), stat = "identity") +
  geom_bar(aes(y = `In Armed Forces`, fill = "Armed Forces"), stat = "identity") +
  scale_fill_manual(values = c("Employed" = "green", "Unemployed" = "red", "Armed Forces" = "blue")) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "State", y = "Number of People", fill = "Category", title = "Labor Force Participation by State")

ggplot(df1_wide, aes(x = UnemploymentRate)) +
  geom_histogram(binwidth = 1, fill = "tomato", color = "black") +
  labs(x = "Unemployment Rate (%)", y = "Number of States", title = "Distribution of Unemployment Rates across States")

# Correlation analysis
employment_data <- df1_wide %>%
  select(`Employed`, `Unemployed`, `In Armed Forces`, `Not in labor force`)

correlation_matrix <- cor(employment_data, use = "complete.obs")

# Visualize the correlation matrix
library(corrplot)
## corrplot 0.92 loaded
corrplot(correlation_matrix, method = "circle")

# Convert state names to abbreviations
state_abbreviations <- state.abb[match(df1_wide$NAME, state.name)]
df1_wide$state <- state_abbreviations

df1_wide$EmploymentRate <- df1_wide$`Employed` / df1_wide$Total * 100

# Plot the map with employment rate
plot_usmap(data = df1_wide, values = "EmploymentRate", labels = TRUE) +
  scale_fill_continuous(name = "Employment Rate (%)", label = scales::percent_format()) +
  theme(legend.position = "right") +
  labs(title = "Employment Rate by State in 2021")



DF 2 - Education

#Neha
#Education - K201501
df2 <- get_acs(geography = "state", 
                table = "K201501", 
                year = 2021, 
                survey = "acs1/subject",cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201501 and caching the dataset for faster future access.
variables_df2 <- data.frame(
  variable = c("K201501_001", "K201501_002", "K201501_003", "K201501_004", "K201501_005", "K201501_006", "K201501_007", "K201501_008"),
  label = c("Education_Total_students", "Education_Below_9th grade", "Education_9th to 12th grade_no diploma", "Education_High_school_graduate", "Education_Some college_no degree", "Education_Associates_degree", "Education_Bachelors_degree", "Education_Graduate_professional degree")
)

df2_labeled <- df2 %>%
  left_join(variables_df2, by = "variable")

df2_wide <- df2_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)

Exploratory Data Analysis (EDA)

# Ensure necessary libraries are installed and loaded
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidyr")) install.packages("tidyr")
library(ggplot2)
library(tidyr)

# Convert df2_wide to long format
df2_long <- df2_wide %>%
  gather(key = "Education_Level", value = "Count", -NAME)

# Plotting the distribution of each education level across states
ggplot(df2_long, aes(x = NAME, y = Count, fill = Education_Level)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Distribution of Education Levels across States", 
       x = "State", 
       y = "Count")

Exploratory Data Analysis (EDA) based on percentage

# Ensure necessary libraries are installed and loaded
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidyr")) install.packages("tidyr")
if (!require("dplyr")) install.packages("dplyr")
library(ggplot2)
library(tidyr)
library(dplyr)

# Convert df2_wide to long format
df2_long <- df2_wide %>%
  gather(key = "Education_Level", value = "Count", -NAME)

# Calculate total count for each state
df2_long <- df2_long %>%
  group_by(NAME) %>%
  mutate(Total = sum(Count)) %>%
  ungroup()

# Calculate the percentage
df2_long <- df2_long %>%
  mutate(Percentage = (Count / Total) * 100)

# Plotting the distribution of each education level across states as a percentage
ggplot(df2_long, aes(x = NAME, y = Percentage, fill = Education_Level)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Percentage Distribution of Education Levels across States", 
       x = "State", 
       y = "Percentage")

library(ggplot2)
library(tidyr)
library(dplyr)

# Convert df2_wide to long format
df2_long <- df2_wide %>%
  gather(key = "Education_Level", value = "Count", -NAME)

# Calculate total count for each state only once
total_counts <- df2_long %>%
  group_by(NAME) %>%
  summarize(Total = sum(Count)) 

# Join with original data to add the Total column
df2_long <- df2_long %>%
  left_join(total_counts, by = "NAME")

# Calculate the percentage for each education level
df2_long <- df2_long %>%
  mutate(Percentage = (Count / Total) * 100)

# Create graphs for each educational level
unique_levels <- unique(df2_long$Education_Level)
plots <- list()

for (level in unique_levels) {
  df_subset <- filter(df2_long, Education_Level == level)

  p <- ggplot(df_subset, aes(x = NAME, y = Percentage, fill = Education_Level)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = paste("Percentage Distribution of", level, "across States"), 
         x = "State", 
         y = "Percentage") +
    guides(fill = guide_legend(title = "Education Level"))

  plots[[level]] <- p
}

# Print plots (optional)
for (plot in plots) {
  print(plot)
}

df2_percentages <- df2_wide %>%
  mutate(
    `Education_Below_9th grade_Percentage` = `Education_Below_9th grade` / Education_Total_students * 100,
    `Education_9th to 12th grade_no diploma_Percentage` = `Education_9th to 12th grade_no diploma` / Education_Total_students * 100,
    Education_High_school_graduate_Percentage = Education_High_school_graduate / Education_Total_students * 100,
    `Education_Some college_no degree_Percentage` = `Education_Some college_no degree` / Education_Total_students * 100,
    Education_Associates_degree_Percentage = Education_Associates_degree / Education_Total_students * 100,
    Education_Bachelors_degree_Percentage = Education_Bachelors_degree / Education_Total_students * 100,
    `Education_Graduate_professional degree_Percentage` = `Education_Graduate_professional degree` / Education_Total_students * 100
  )
percentage_columns <- grep("_Percentage$", names(df2_percentages), value = TRUE)
df2_long <- df2_percentages %>%
  pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(percentage_columns)
## 
##   # Now:
##   data %>% select(all_of(percentage_columns))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(df2_long, aes(x = reorder(NAME, -Education_Total_students), y = Percentage, fill = Age_Group)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Age Distribution by State",
       x = "State",
       y = "Percentage of Total Population",
       fill = "Age Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_d()  # Adjust the color scale as needed



DF3 - Citizenship

#Esha
#Citizenship - K200501

df3<- get_acs(
  geography = "state", 
  table = "K200501", 
  year = 2021, 
  survey = "acs1/subject", 
  cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200501 and caching the dataset for faster future access.
variables_df3 <- data.frame(
  variable = c("K200501_001", "K200501_002","K200501_003"),  
  label = c("Total","U.S. citizen", "Not a U.S. citizen")  
)

df3_labeled <- df3 %>%
  left_join(variables_df3, by = "variable")

df3_wide <- df3_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)

# Load necessary libraries
library(ggplot2)
library(tidyr)
library(dplyr)

# Convert df3_wide to long format
df3_long <- df3_wide %>%
  gather(key = "Citizenship", value = "Count", -NAME)

# Calculate total count for each state
df3_long <- df3_long %>%
  group_by(NAME) %>%
  mutate(Total = sum(Count)) %>%
  ungroup()

# Calculate the percentage
df3_long <- df3_long %>%
  mutate(Percentage = (Count / Total) * 100)

# Iterate over each unique citizenship category
for (category in unique(df3_long$Citizenship)) {
  # Filter data for the current citizenship category
  df3_subset <- df3_long %>% 
    filter(Citizenship == category)

  # Generate and print the plot for citizenship
  p <- ggplot(df3_subset, aes(x = NAME, y = Percentage, fill = Citizenship)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(title = paste("Percentage Distribution of", category, "across States"), 
         x = "State", 
         y = "Percentage")

  print(p)
}
## Warning: Removed 1 rows containing missing values (`position_stack()`).

## Warning: Removed 1 rows containing missing values (`position_stack()`).

## Warning: Removed 1 rows containing missing values (`position_stack()`).



DF4 - Age

#Srika
#Age - K200104
df4 <- get_acs(geography = "state", 
                table = "K200104", 
                year = 2021,
                survey = "acs1/subject",cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
variables_df4 <- data.frame(
  variable = c("K200104_001", "K200104_002", "K200104_003", "K200104_004", "K200104_005", "K200104_006", "K200104_007", "K200104_008"),

  label = c("Total_age", "Age_under_18", 
            "Age_18_to_24", "Age_25_to_34", 
            "Age_35_to_44", "Age_45_to_54", "Age_55_to_64","Age_over_64")
)

df4_labeled <- df4 %>%
  left_join(variables_df4, by = "variable")

df4_wide <- df4_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
df4_percentages <- df4_wide %>%
  mutate(
    Age_under_18_Percentage = Age_under_18 / Total_age * 100,
    Age_18_to_24_Percentage = Age_18_to_24 / Total_age * 100,
    Age_25_to_34_Percentage = Age_25_to_34 / Total_age * 100,
    Age_35_to_44_Percentage = Age_35_to_44 / Total_age * 100,
    Age_45_to_54_Percentage = Age_45_to_54 / Total_age * 100,
    Age_55_to_64_Percentage = Age_55_to_64 / Total_age * 100,
    Age_over_64_Percentage = Age_over_64 / Total_age * 100
  )
percentage_columns <- grep("_Percentage$", names(df4_percentages), value = TRUE)
df4_long <- df4_percentages %>%
  pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")
ggplot(df4_long, aes(x = reorder(NAME, -Total_age), y = Percentage, fill = Age_Group)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Age Distribution by State",
       x = "State",
       y = "Percentage of Total Population",
       fill = "Age Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_d()  # Adjust the color scale as needed

# Load necessary libraries if not already loaded
library(ggplot2)
library(tidyr)

# Calculate percentages for each age group
df4_percentages <- df4_wide %>%
  mutate(
    Age_under_18_Percentage = Age_under_18 / Total_age * 100,
    Age_18_to_24_Percentage = Age_18_to_24 / Total_age * 100,
    Age_25_to_34_Percentage = Age_25_to_34 / Total_age * 100,
    Age_35_to_44_Percentage = Age_35_to_44 / Total_age * 100,
    Age_45_to_54_Percentage = Age_45_to_54 / Total_age * 100,
    Age_55_to_64_Percentage = Age_55_to_64 / Total_age * 100,
    Age_over_64_Percentage = Age_over_64 / Total_age * 100
  )

# Filter columns based on names ending with "Percentage"
percentage_columns <- grep("_Percentage$", names(df4_percentages), value = TRUE)

# Reshape the data to a longer format
df4_long <- df4_percentages %>%
  pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")

# Create a facet grid for each age group
ggplot(df4_long, aes(x = reorder(NAME, -Total_age), y = Percentage, fill = Age_Group)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_grid(Age_Group ~ ., scales = "free_y") +
  labs(title = "Age Distribution by State",
       x = "State",
       y = "Percentage of Total Population",
       fill = "Age Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_fill_viridis_d()  # Adjust the color scale as needed

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::map()    masks maps::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
Age_data <- get_acs(geography = "state", 
                table = "K200104", 
                year = 2021, 
               geometry = TRUE,
               resolution = "20m",
                survey = "acs1/subject",cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |======================================================================| 100%
Mapping_age<-Age_data%>%
  filter(variable=="K200104_007")%>%
  shift_geometry()
ggplot(data = Mapping_age, aes(fill = estimate)) + 
  geom_sf()+
  scale_fill_distiller(palette = "RdPu", 
                       direction = 1) + 
  labs(title = "Median Age by State, 2021",
       caption = "Data source: 2021 1-year ACS, US Census Bureau",
       fill = "ACS estimate") + 
  theme_void()



DF5 - Housing

#Niharika
# Housing - K202502
df5 <- get_acs(geography = "state", 
                table = "K202502", 
                year = 2021, 
                survey = "acs1", # removed /subject since we're referring to a specific table ID
                cache_table = TRUE)
## Getting data from the 2021 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202502 and caching the dataset for faster future access.
# Define the variable codes and their respective labels for the housing dataset (df5)
variables_df5 <- data.frame(
  variable = c("K202502_001", "K202502_002", "K202502_003"),
  label = c("Total", "Owner Occupied", "Renter Occupied")
)

# Assuming df5 is your housing dataset
# Reshape the data so that state names are rows
df5_labeled <- df5 %>%
  left_join(variables_df5, by = "variable")  # Joining with the variable labels dataframe

# Reshape the data to have state names as rows and variable labels as columns
df5_wide <- df5_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)

# df5_wide will have states as rows and different housing-related variables as columns
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Assuming df5_wide is your dataset for housing
# Melt the data to long format for easy plotting
df5_long <- tidyr::gather(df5_wide, key = "Housing_Category", value = "Count", -NAME)

# Calculate the percentage for Owner and Renter Occupied categories
df5_long <- df5_long %>%
  group_by(NAME) %>%
  mutate(Total = sum(Count[which(Housing_Category == "Total")])) %>%
  ungroup() %>%
  mutate(Percentage = ifelse(Housing_Category != "Total", Count / Total * 100, Count))

# Define colors for each category
category_colors <- c("Owner Occupied" = "steelblue", "Renter Occupied" = "salmon", "Total" = "lightgreen")

# Function to create a bar plot
create_bar_plot <- function(data, category, title) {
  ggplot(data[data$Housing_Category == category, ], aes(x = reorder(NAME, -Percentage), y = Percentage, fill = Housing_Category)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    labs(x = "State", y = "Percentage", title = title) +
    scale_y_continuous(labels = percent_format(scale = 1)) +
    scale_fill_manual(values = category_colors) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}

# Plot for Owner Occupied
owner_plot <- create_bar_plot(df5_long, "Owner Occupied", "Percentage of Owner Occupied Housing by State")
print(owner_plot)

# Plot for Renter Occupied
renter_plot <- create_bar_plot(df5_long, "Renter Occupied", "Percentage of Renter Occupied Housing by State")
print(renter_plot)

#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
library(tidyr)

#install.packages("factoextra")
library(factoextra)

#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)

DF6 - Disabilities

#Abhay
#Disabilities - K201803
# Get ACS data for the Disabilities dataset
df6 <- get_acs(geography = "state", 
               table = "K201803", 
               year = 2021, 
               survey = "acs1-year", 
               cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates.  Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201803 and caching the dataset for faster future access.
# Assuming the variables_df6 mapping is similar to variables_df1 but for the Disabilities table
# You need to create variables_df6 with the correct variable codes and labels for the Disabilities data
variables_df6 <- data.frame(
  variable = c("K201803_001", "K201803_002", "K201803_003", "K201803_004", "K201803_005", "K201803_006", "K201803_007","K201803_008","K201803_009"),
  label = c("Total_people", "Total With Disabilities", 
            "Hearing", "Vision difficulty", 
            "cognative", "ambulatory difficulty", 
            "Self-care difficulty","Independent living difficulty","No Disability")
)

# Join your data with the variable labels to get the descriptive names
df6_labeled <- df6 %>%
  left_join(variables_df6, by = "variable")

# Reshape the data so that state names are rows and variable labels are columns
df6_wide <- df6_labeled %>%
  pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
df6_wide
## # A tibble: 52 × 10
##    NAME          Total_people Total With Disabilit…¹ Hearing `Vision difficulty`
##    <chr>                <dbl>                  <dbl>   <dbl>               <dbl>
##  1 Alabama            4957633                 808071  208028              152798
##  2 Alaska              702154                  92390   33397               15748
##  3 Arizona            7174053                 972252  298849              180792
##  4 Arkansas           2974701                 517051  142133              105624
##  5 California        38724294                4324355 1140131              844049
##  6 Colorado           5715497                 640346  211803              120570
##  7 Connecticut        3557526                 427014  113490               78078
##  8 Delaware            987964                 130551   37933               25335
##  9 District of …       659979                  76754   14429               14569
## 10 Florida           21465883                2906367  812248              555361
## # ℹ 42 more rows
## # ℹ abbreviated name: ¹​`Total With Disabilities`
## # ℹ 5 more variables: cognative <dbl>, `ambulatory difficulty` <dbl>,
## #   `Self-care difficulty` <dbl>, `Independent living difficulty` <dbl>,
## #   `No Disability` <dbl>
# df6_wide now has states as rows and the descriptive variable labels as columns
# Calculate percentages
df6_wide <- df6_wide %>%
  mutate(DisabilityRate = `Total With Disabilities` / `Total_people` * 100)

# Plotting with ggplot2
library(ggplot2)

ggplot(df6_wide, aes(x = reorder(NAME, -DisabilityRate), y = DisabilityRate)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(x = "State", y = "Disability Rate (%)", title = "Disability Rates by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Select only the disability-related columns for correlation
disability_data <- df6_wide %>%
  select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)

# Compute correlation matrix
correlation_matrix <- cor(disability_data, use = "complete.obs")  # use = "complete.obs" handles missing values

# Check if corrplot is installed; if not, install it
if (!require(corrplot)) install.packages("corrplot")

library(corrplot)
corrplot(correlation_matrix, method = "circle")

# Comparative analysis of disabilities within a state (example: Iowa)
iowa_data <- df6_wide %>%
  filter(NAME == "Iowa") %>%
 select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)

# Plotting the data for Iowa
barplot(as.matrix(iowa_data), beside = TRUE, legend.text = TRUE, args.legend = list(x = "topright"))

# Predicting 'Ambulatory difficulty' based on other disabilities
lm_model <- lm(`ambulatory difficulty` ~ `Hearing` + `Vision difficulty` + `cognative` + `Self-care difficulty` + `Independent living difficulty`, data = df6_wide)

# Summary of the linear model
summary(lm_model)
## 
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` + 
##     cognative + `Self-care difficulty` + `Independent living difficulty`, 
##     data = df6_wide)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -56621 -11941  -2766  17524  73868 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -8200.5856  5646.4977  -1.452 0.153197    
## Hearing                             0.6796     0.1793   3.791 0.000436 ***
## `Vision difficulty`                 1.0070     0.1644   6.126 1.87e-07 ***
## cognative                          -0.5426     0.2291  -2.369 0.022116 *  
## `Self-care difficulty`             -1.9464     0.4434  -4.390 6.59e-05 ***
## `Independent living difficulty`     1.9228     0.3144   6.115 1.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24940 on 46 degrees of freedom
## Multiple R-squared:  0.9968, Adjusted R-squared:  0.9965 
## F-statistic:  2909 on 5 and 46 DF,  p-value: < 2.2e-16
lm_model
## 
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` + 
##     cognative + `Self-care difficulty` + `Independent living difficulty`, 
##     data = df6_wide)
## 
## Coefficients:
##                     (Intercept)                          Hearing  
##                      -8200.5856                           0.6796  
##             `Vision difficulty`                        cognative  
##                          1.0070                          -0.5426  
##          `Self-care difficulty`  `Independent living difficulty`  
##                         -1.9464                           1.9228
# K-means clustering
#Perform a clustering analysis to identify groups of states with similar disability profiles.


# K-means clustering with 3 centers
set.seed(123)  # For reproducibility
clusters <- kmeans(disability_data, centers = 3)

# Add cluster assignment to the data
df6_wide$Cluster <- as.factor(clusters$cluster)

pca_res <- prcomp(disability_data, scale. = TRUE)
fviz_pca_biplot(pca_res, label = "none", col.ind = df6_wide$Cluster, addEllipses = TRUE)



Part 1

Title

“Socio-Economic Factors Influencing Employment in the United States: A Comprehensive State-by-State Analysis”

Authors

  1. Abhay Prasanna Rao

  2. Srika Raja

  3. Neha

  4. Esha

  5. Niharika

Abstract (TL;DR)

This project investigates the impact of socio-economic factors on employment rates across U.S. states. Utilizing ACS 2021 data, we explore relationships between employment and variables like education, citizenship, and housing. Key findings include significant correlations that inform employment dynamics in the U.S.

Motivation

We aim to analyze various socio-economic factors influencing employment in the U.S. This study is crucial for understanding how different aspects like education, age, and housing contribute to employment rates, thereby aiding policymakers and researchers.

Summary

We have imported data set from the ACS survey. We have 6 child RMD files for this project which has the data analysis for the Employment, Education, Citizenship, Age, Housing, Disabilities Data Set (ACS 2021).
Further, we started exploring each data set in detail and then we started combining each data set with the employment to see what results we can expect. We did find many direct relationships with each data set on employment data set. We have put our concluding results in the Final Report to help us stand by with our conclusions.

Part 2:

Data Sets

The below data sets are from data.census.gov [ United States Census Bureau]. We shortlisted it based on ACS 2021, inclusive for all states in United States.

  1. Employment - K202301

  2. Education - K201501

  3. Citizenship - K200501

  4. Age - K200104

  5. Housing - K202502

  6. Disabilities - K201803